En este análisis, el equipo Omega explora datos de tiendas OXXO
para detectar patrones de tiendas exitosas a partir de nuevas variables

¡Empecemos!

Análisis Exploratorio de los Datos

Carga de Librerías y Base de Datos

library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 4.4.1
## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.1
## 
## Adjuntando el paquete: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(lightgbm)
## Warning: package 'lightgbm' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Adjuntando el paquete: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(rpart)
library(nnet)
library(ggplot2)
library(reshape2)   
## Warning: package 'reshape2' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(e1071)    
library(ggcorrplot)        
## Warning: package 'ggcorrplot' was built under R version 4.4.3
library(skimr) 
## Warning: package 'skimr' was built under R version 4.4.3
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.1
library(naniar)       
## Warning: package 'naniar' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'naniar'
## The following object is masked from 'package:skimr':
## 
##     n_complete
library(janitor)      
## Warning: package 'janitor' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(GGally)    
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(themis)
## Warning: package 'themis' was built under R version 4.4.3
## Cargando paquete requerido: recipes
## Warning: package 'recipes' was built under R version 4.4.1
## 
## Adjuntando el paquete: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.1
## 
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(broom)
library(themis)   
library(recipes)   
library(xgboost)
library(caret)
library(MLmetrics)
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.3
## Cargando paquete requerido: foreach
## Warning: package 'foreach' was built under R version 4.4.1
## Cargando paquete requerido: iterators
## Warning: package 'iterators' was built under R version 4.4.1
## Cargando paquete requerido: parallel
library(caret)
library(caretEnsemble)
## Warning: package 'caretEnsemble' was built under R version 4.4.3
library(doParallel)

# Load the OXXO stores dataset; keep text columns as character, not factors.
df <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE)

Columnas con nulos - Después de imputar

# Per-column missingness summary (naniar), sorted by percent missing and
# printed in full; every count is zero after imputation.
miss_var_summary(df) %>%
  arrange(desc(pct_miss)) %>%
  print(n = nrow(.))
## # A tibble: 23 × 3
##    variable                   n_miss pct_miss
##    <chr>                       <int>    <num>
##  1 tienda_id                       0        0
##  2 plaza_cve                       0        0
##  3 nivelsocioeconomico_des         0        0
##  4 entorno_des                     0        0
##  5 mts2ventas_num                  0        0
##  6 puertasrefrig_num               0        0
##  7 cajonesestacionamiento_num      0        0
##  8 segmento_maestro_desc           0        0
##  9 lid_ubicacion_tienda            0        0
## 10 dataset                         0        0
## 11 meta_venta                      0        0
## 12 porcentaje_cumplimiento         0        0
## 13 venta_promedio                  0        0
## 14 dist_cerca                      0        0
## 15 latitud_num                     0        0
## 16 longitud_num                    0        0
## 17 num_escuelas                    0        0
## 18 num_abarrotes                   0        0
## 19 num_super                       0        0
## 20 num_farmacia                    0        0
## 21 num_oficina                     0        0
## 22 num_establecimientos            0        0
## 23 num_gasolinera                  0        0

Histogramas

# Draw one histogram per numeric column of df.
# aes_string() is deprecated since ggplot2 3.0; the .data pronoun is the
# supported way to map a column named by a character variable.
num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
for (col in num_cols) {
  p <- ggplot(df, aes(x = .data[[col]])) +
    geom_histogram(bins = 30) +
    theme_minimal() +
    labs(title = paste("Histograma de", col))
  print(p)
}

Boxplots

# One boxplot per numeric column, arranged four to a row with grid.arrange().
# aes_string(x = "1") parsed the string into the numeric literal 1, so the
# non-deprecated equivalent is aes(x = 1, ...) with the .data pronoun for y.
num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
plots <- lapply(num_cols, function(col) {
  ggplot(df, aes(x = 1, y = .data[[col]])) +
    geom_boxplot() +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x  = element_blank(),
          axis.ticks.x = element_blank()) +
    labs(title = paste("Boxplot de", col))
})
# seq(1, 0, by = 4) would error on an empty plot list, so guard it.
if (length(plots) > 0) {
  for (i in seq(1, length(plots), by = 4)) {
    grid.arrange(grobs = plots[i:min(i + 3, length(plots))], ncol = 4)
  }
}

Distancia al 7-Eleven

# Distance from each OXXO store to its nearest 7-Eleven (metres).
library(dplyr)
library(geosphere)

# Read both datasets
df  <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE)
sev <- read.csv("7eleven_mty_tamps.csv", stringsAsFactors = FALSE)

# 1. Keep only rows with valid coordinates
df_geo  <- df  %>% filter(!is.na(latitud_num), !is.na(longitud_num))
sev_geo <- sev %>% filter(!if_all(everything(), is.na))

# 2. Locate the latitude/longitude column names in sev dynamically.
#    ("lon" already matches "long", so that alternative is redundant.)
#    NOTE(review): grep takes the FIRST match — assumes no other column
#    name contains "lat"/"lon"; confirm against the CSV header.
lat_col <- grep("lat", names(sev_geo),     ignore.case = TRUE, value = TRUE)[1]
lng_col <- grep("lon|lng", names(sev_geo), ignore.case = TRUE, value = TRUE)[1]

# 3. 7-Eleven coordinate table, longitude first as geosphere expects
coords_sev <- sev_geo %>%
  select(
    lng = all_of(lng_col),
    lat = all_of(lat_col)
  )

# Hoist the loop-invariant destination matrix out of the apply() closure
# (the original rebuilt it once per store row).
destinos <- as.matrix(coords_sev)

# 4. dist_se = minimum Haversine distance from each OXXO to any 7-Eleven
df_geo$dist_se <- apply(
  df_geo[, c("longitud_num", "latitud_num")], 1,
  function(row) {
    origen <- c(as.numeric(row["longitud_num"]), as.numeric(row["latitud_num"]))
    round(min(distHaversine(origen, destinos), na.rm = TRUE), 2)
  }
)

# Attach dist_se back onto the full data frame by store id
df <- df %>%
  left_join(df_geo %>% select(tienda_id, dist_se), by = "tienda_id")

ANOVA y Matriz de Correlación

library(dplyr)
library(ggcorrplot)

# Prepare df2_trans for the ANOVA: lower-case column names, characters to
# factors, a log1p transform of dist_cerca, and identifier/coordinate
# columns dropped. rename_all() is superseded in dplyr; rename_with() is
# the current verb and behaves identically here.
df2_trans <- df %>%
  rename_with(tolower) %>%
  mutate(
    across(where(is.character), as.factor),
    log_dist_cerca = log1p(dist_cerca)
  ) %>%
  select(-tienda_id, -latitud_num, -longitud_num)

# Fit the ANOVA of venta_promedio on all candidate predictors.
# NOTE(review): aov() reports sequential (Type I) sums of squares, so each
# term's F test depends on the order of terms in this formula.
fit_trans <- aov(
  venta_promedio ~ 
    nivelsocioeconomico_des +
    entorno_des +
    segmento_maestro_desc +
    lid_ubicacion_tienda +
    plaza_cve +
    mts2ventas_num +
    puertasrefrig_num +
    cajonesestacionamiento_num +
    porcentaje_cumplimiento +
    log_dist_cerca +  # log-transformed distance
    dist_se         +  # untransformed distance
    num_escuelas +
    num_abarrotes +
    num_super +
    num_farmacia +
    num_oficina +
    num_establecimientos +
    num_gasolinera,
  data = df2_trans
)

# Traditional ANOVA table
summary(fit_trans)
##                              Df    Sum Sq   Mean Sq F value   Pr(>F)    
## nivelsocioeconomico_des       6 1.457e+13 2.429e+12  26.480  < 2e-16 ***
## entorno_des                   3 1.876e+12 6.253e+11   6.818 0.000150 ***
## segmento_maestro_desc         4 9.264e+11 2.316e+11   2.525 0.039424 *  
## lid_ubicacion_tienda          4 1.776e+12 4.441e+11   4.842 0.000716 ***
## plaza_cve                     1 3.638e+11 3.638e+11   3.966 0.046682 *  
## mts2ventas_num                1 6.037e+11 6.037e+11   6.582 0.010444 *  
## puertasrefrig_num             1 6.391e+10 6.391e+10   0.697 0.404063    
## cajonesestacionamiento_num    1 1.048e+10 1.048e+10   0.114 0.735456    
## porcentaje_cumplimiento       1 4.380e+13 4.380e+13 477.526  < 2e-16 ***
## log_dist_cerca                1 7.261e+11 7.261e+11   7.916 0.004994 ** 
## dist_se                       1 1.153e+11 1.153e+11   1.257 0.262510    
## num_escuelas                  1 2.558e+10 2.558e+10   0.279 0.597533    
## num_abarrotes                 1 6.795e+11 6.795e+11   7.409 0.006602 ** 
## num_super                     1 3.703e+09 3.703e+09   0.040 0.840794    
## num_farmacia                  1 3.345e+10 3.345e+10   0.365 0.546032    
## num_oficina                   1 1.974e+11 1.974e+11   2.152 0.142666    
## num_establecimientos          1 2.397e+09 2.397e+09   0.026 0.871609    
## num_gasolinera                1 2.330e+10 2.330e+10   0.254 0.614390    
## Residuals                  1022 9.374e+13 9.172e+10                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correlation matrix of the numeric predictors (log_dist_cerca and dist_se
# included); the response venta_promedio is excluded from the plot.
numeric_vars <- df2_trans %>%
  select(where(is.numeric)) %>%
  select(!venta_promedio)

corr_mat <- cor(numeric_vars, use = "pairwise.complete.obs")
ggcorrplot(corr_mat, lab = TRUE)

Distribución de Exitoso y No Exitoso

# Binary target: a store counts as successful ("exitoso" = 1) when its goal
# attainment exceeds 91%.
df <- df %>%
  mutate(exitoso = as.integer(porcentaje_cumplimiento > 91))
tab <- table(df$exitoso)
cat("Conteo:\n"); print(tab)
## Conteo:
## 
##   0   1 
## 263 791
# Class shares in percent — roughly 75/25, i.e. a moderate class imbalance.
cat("\n%:\n"); print(round(prop.table(tab)*100,2))
## 
## %:
## 
##     0     1 
## 24.95 75.05
# Persist the labelled data for later use.
write.csv(df, "df_exit.csv", row.names=FALSE)

Mapa Oxxos

# 1) Libraries
library(dplyr)
library(leaflet)

# 2) Load the stores data
df <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE)

# 3) Keep only rows whose coordinates are present
df_map <- df %>%
  filter(!is.na(latitud_num), !is.na(longitud_num))

# 4) Interactive map: one red dot per store, view centred on the mean
#    coordinate of all mapped stores.
center_lng <- mean(df_map$longitud_num, na.rm = TRUE)
center_lat <- mean(df_map$latitud_num, na.rm = TRUE)

leaflet(df_map) %>%
  addTiles() %>%  # OpenStreetMap base layer
  addCircleMarkers(
    lng = ~longitud_num,
    lat = ~latitud_num,
    radius = 4,
    color = "red",
    stroke = FALSE,
    fillOpacity = 0.6,
    popup = ~paste0(
      "<strong>Tienda:</strong> ", tienda_id, "<br/>",
      "<strong>% Cumplimiento:</strong> ", porcentaje_cumplimiento, "%"
    )
  ) %>%
  setView(lng = center_lng, lat = center_lat, zoom = 7)

Mapa 7-Eleven

library(dplyr)
library(leaflet)

# 1) Read the 7-Eleven dataset (adjust path/file name as needed)
df7e <- read.csv("7eleven_mty_tamps.csv", stringsAsFactors = FALSE)

# 2) Drop rows without valid coordinates
df7e <- df7e %>%
  filter(!is.na(lat), !is.na(lng))

# 3) Green markers, view centred on the mean coordinate
center7_lng <- mean(df7e$lng, na.rm = TRUE)
center7_lat <- mean(df7e$lat, na.rm = TRUE)

leaflet(df7e) %>%
  addTiles() %>%
  addCircleMarkers(
    lng         = ~lng,
    lat         = ~lat,
    radius      = 5,
    color       = "darkgreen",
    fillColor   = "green",
    fillOpacity = 0.7,
    stroke      = TRUE,
    popup       = ~paste0(
      "<strong>", name, "</strong><br/>",
      address, "<br/>",
      "CP buscado: ", postal_code_searched
    )
  ) %>%
  setView(lng = center7_lng, lat = center7_lat, zoom = 7)

Modelos Predictivos

Creación de la variable a predecir y preparación

# Carga de librerías
library(dplyr)
library(tidyr)
library(geosphere)
library(caret)
library(recipes)
library(themis)
library(MLmetrics)

# 1) Read the stores data, derive the binary target and the log-distance
#    feature in one pass.
df <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE) %>%
  mutate(
    exitoso        = factor(if_else(porcentaje_cumplimiento > 91, "Yes", "No"),
                            levels = c("No", "Yes")),
    log_dist_cerca = log1p(dist_cerca)
  )

# 2) Read the 7-Eleven locations, keeping only valid coordinates
sev <- read.csv("7eleven_mty_tamps.csv", stringsAsFactors = FALSE) %>%
  filter(!is.na(lat), !is.na(lng))

# Great-circle (Haversine) distance in metres between (lon1, lat1) and
# (lon2, lat2), in degrees. Vectorised over its arguments via R recycling.
#
# radius: sphere radius in metres. Defaults to the mean Earth radius,
#         6 371 000 m, matching the previously hard-coded constant, but can
#         now be overridden (e.g. for kilometres, pass radius = 6371).
#
# pmin(1, sqrt(a)) clamps tiny floating-point overshoots above 1 so that
# asin() never receives an out-of-domain argument.
haversine <- function(lon1, lat1, lon2, lat2, radius = 6371000) {
  to_rad <- pi / 180
  lon1 <- lon1 * to_rad; lat1 <- lat1 * to_rad
  lon2 <- lon2 * to_rad; lat2 <- lat2 * to_rad
  dlon <- lon2 - lon1
  dlat <- lat2 - lat1
  a <- sin(dlat / 2)^2 + cos(lat1) * cos(lat2) * sin(dlon / 2)^2
  2 * radius * asin(pmin(1, sqrt(a)))
}

coords_sev <- sev %>% select(lng, lat)

# Minimum distance (metres, rounded to 2 dp) from each store to any
# 7-Eleven. haversine() is already vectorised over the 7-Eleven
# coordinates, so a type-stable vapply over store rows replaces the much
# slower rowwise()/mutate() pattern while producing identical values.
df <- df %>%
  filter(!is.na(latitud_num), !is.na(longitud_num))
df$dist_se <- vapply(
  seq_len(nrow(df)),
  function(i) {
    round(
      min(
        haversine(
          lon1 = df$longitud_num[i], lat1 = df$latitud_num[i],
          lon2 = coords_sev$lng,     lat2 = coords_sev$lat
        ),
        na.rm = TRUE
      ),
      2
    )
  },
  numeric(1)
)

# 3) TRAIN / TEST split as flagged in the source data
train_df <- filter(df, dataset == "TRAIN")
test_df  <- filter(df, dataset == "TEST")

# 4) Target encoding of the two categorical predictors: each level is
#    replaced by the share of successful stores, computed on TRAIN only.
seg_means <- train_df %>%
  group_by(segmento_maestro_desc) %>%
  summarise(enc_segmento = mean(exitoso == "Yes"), .groups = "drop")
lid_means <- train_df %>%
  group_by(lid_ubicacion_tienda) %>%
  summarise(enc_lid = mean(exitoso == "Yes"), .groups = "drop")

train_df <- train_df %>%
  left_join(seg_means, by = "segmento_maestro_desc") %>%
  left_join(lid_means, by = "lid_ubicacion_tienda")
test_df <- test_df %>%
  left_join(seg_means, by = "segmento_maestro_desc") %>%
  left_join(lid_means, by = "lid_ubicacion_tienda")

# Levels unseen in TRAIN fall back to the global mean encoding;
# coalesce() fills the NAs exactly as replace_na() did.
global_seg <- mean(train_df$enc_segmento, na.rm = TRUE)
global_lid <- mean(train_df$enc_lid, na.rm = TRUE)
train_df <- train_df %>%
  mutate(
    enc_segmento = coalesce(enc_segmento, global_seg),
    enc_lid      = coalesce(enc_lid, global_lid)
  )
test_df <- test_df %>%
  mutate(
    enc_segmento = coalesce(enc_segmento, global_seg),
    enc_lid      = coalesce(enc_lid, global_lid)
  )

# 5) Ordinal encoding of the socioeconomic level, A through D.
niveles <- c("A","AB","B","BC","C","CD","D")

# Map a level label to its 1-based position in `niveles`; labels outside
# the list become NA, exactly as the original factor conversion did.
# Extracted into one helper to remove the duplicated train/test code.
encode_nivel_socio <- function(x) {
  as.integer(factor(x, levels = niveles, ordered = TRUE))
}

train_df <- train_df %>%
  mutate(nivel_socio_ord = encode_nivel_socio(nivelsocioeconomico_des))
test_df <- test_df %>%
  mutate(nivel_socio_ord = encode_nivel_socio(nivelsocioeconomico_des))

# 6) Drop metadata, leakage-prone and unused columns before modelling
#    (venta_promedio / porcentaje_cumplimiento define the target, so they
#    must not remain as predictors).
drops <- c(
  "dataset","tienda_id","plaza_cve","latitud_num","longitud_num",
  "venta_promedio","porcentaje_cumplimiento",
  "nivelsocioeconomico_des","entorno_des",
  "segmento_maestro_desc","lid_ubicacion_tienda",
  "num_escuelas","dist_cerca"
)
train_df <- train_df %>% select(-any_of(drops))
test_df  <- test_df  %>% select(-any_of(drops))

# 7) Remove constant predictors — one distinct value carries no signal.
#    vapply() replaces sapply() for a type-stable integer result.
preds  <- setdiff(names(train_df), "exitoso")
consts <- preds[vapply(train_df[preds], n_distinct, integer(1)) == 1]
train_df <- train_df %>% select(-all_of(consts))
test_df  <- test_df  %>% select(-all_of(consts))

# 8) Preprocessing recipe: SMOTE oversampling of the minority class, one-hot
#    dummies, then centring and scaling.
#    NOTE(review): step_smote() runs before step_dummy(); themis' SMOTE
#    expects numeric predictors, so this relies on every remaining nominal
#    predictor already being numeric-encoded — confirm if columns change.
#    step_smote() is skipped at bake time by default, so the test set is
#    NOT oversampled.
set.seed(123)
rec <- recipe(exitoso ~ ., data = train_df) %>%
  step_smote(exitoso) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  prep()

# new_data = NULL returns the prepped (SMOTE-balanced) training data.
train_proc <- bake(rec, new_data = NULL)
test_proc  <- bake(rec, new_data = test_df)

# 9) Split predictors / response
X_train <- train_proc %>% select(-exitoso)
y_train <- train_proc$exitoso
X_test  <- test_proc  %>% select(-exitoso)
y_test  <- test_proc$exitoso

# 9b) Aliases and the shared 5-fold CV control reused by the model chunks:
X_train_scaled <- X_train
X_test_scaled  <- X_test
pos_lvl        <- "Yes"
ctrl_acc       <- trainControl(
  method          = "cv",
  number          = 5,
  classProbs      = TRUE,
  summaryFunction = defaultSummary,
  savePredictions = "final"
)

Logistic Regression con Elastic Net (glmnet)

# <h2 style="color:gold;">Logistic Regression con Elastic Net (glmnet)</h2>
library(glmnet)
library(caret)

# Matrix/binary copies of the training data.
# NOTE(review): X_mat and y_bin are never used below — caret::train() is
# called with the data frame directly; consider removing them.
X_mat <- as.matrix(X_train)
y_bin <- ifelse(y_train=="Yes",1,0)

# 5-fold CV control with class probabilities
ctrl <- trainControl(
  method          = "cv",
  number          = 5,
  classProbs      = TRUE,
  summaryFunction = defaultSummary
)

# Tune elastic net over a 5x20 (alpha, lambda) grid, optimising Accuracy
set.seed(123)
grid <- expand.grid(
  alpha = seq(0,1, length=5),
  lambda = 10^seq(-4, 0, length=20)
)
model_glmnet <- train(
  x         = X_train,
  y         = y_train,
  method    = "glmnet",
  metric    = "Accuracy",
  trControl = ctrl,
  tuneGrid  = grid
)

# Evaluate on TEST.
# NOTE(review): the decision threshold below is tuned against the TEST
# labels, which optimistically biases the reported metrics — tune it on
# TRAIN or a validation fold instead (as the XGBoost chunk does).
probs_enet <- predict(model_glmnet, X_test, type="prob")[, "Yes"]
best_t <- seq(0.1,0.9,0.01)[ which.max(sapply(seq(0.1,0.9,0.01), function(t)
  mean(factor(ifelse(probs_enet>t,"Yes","No"),
              levels=c("No","Yes")) == y_test)
))]
pred_enet <- factor(ifelse(probs_enet>best_t,"Yes","No"), levels=c("No","Yes"))
cm_enet   <- confusionMatrix(pred_enet, y_test, positive="Yes")

# Metrics
cat("Elastic Net (α=",model_glmnet$bestTune$alpha,
    ", λ=",model_glmnet$bestTune$lambda,")\n", sep="")
## Elastic Net (α=0, λ=0.05455595)
cat("Best threshold:", best_t, "\n")
## Best threshold: 0.36
cat("Accuracy:   ", round(cm_enet$overall["Accuracy"],   3), "\n")
## Accuracy:    0.781
cat("Precision:  ", round(cm_enet$byClass["Pos Pred Value"], 3), "\n")
## Precision:   0.781
cat("Sensitivity:", round(cm_enet$byClass["Sensitivity"],     3), "\n")
## Sensitivity: 0.974
cat("F1 Score:   ", round(F1_Score(y_test, pred_enet, positive="Yes"), 3), "\n")
## F1 Score:    0.867

K-Nearest Neighbors

# 1) Train KNN (5 candidate k values), optimising Accuracy with 5-fold CV.
# NOTE(review): no set.seed() before train(), so CV folds differ between
# runs and results are not reproducible.
model_knn_acc <- train(
  x         = X_train_scaled,
  y         = y_train,
  method    = "knn",
  metric    = "Accuracy",
  trControl = ctrl_acc,
  tuneLength= 5
)

# 2) Predicted "Yes" probabilities on the test set
probs_knn <- predict(model_knn_acc, X_test_scaled, type = "prob")[, pos_lvl]

# 3) Threshold search maximising Accuracy.
# NOTE(review): tuned on the TEST labels — optimistic bias; see the
# XGBoost chunk for the train-side alternative.
ths_knn  <- seq(0.1, 0.9, by = 0.01)
accs_knn <- sapply(ths_knn, function(t) {
  preds_t <- factor(ifelse(probs_knn > t, pos_lvl, "No"),
                    levels = c("No","Yes"))
  mean(preds_t == y_test)
})
best_t_knn <- ths_knn[which.max(accs_knn)]
cat("KNN best threshold (Accuracy):", best_t_knn, "\n\n")
## KNN best threshold (Accuracy): 0.1
# 4) Final evaluation at the chosen threshold
pred_knn <- factor(ifelse(probs_knn > best_t_knn, pos_lvl, "No"),
                   levels = c("No","Yes"))
cm_knn   <- confusionMatrix(pred_knn, y_test, positive = pos_lvl)

# 5) Metrics (Accuracy first)
cat("Accuracy:   ", round(cm_knn$overall["Accuracy"],  3), "\n")
## Accuracy:    0.724
cat("Precision:  ", round(cm_knn$byClass["Pos Pred Value"], 3), "\n")
## Precision:   0.745
cat("Sensitivity:", round(cm_knn$byClass["Sensitivity"],    3), "\n")
## Sensitivity: 0.948
cat("F1 Score:   ", round(F1_Score(y_test, pred_knn, positive = pos_lvl), 3), "\n")
## F1 Score:    0.834

Decision Tree

# 1) Train a CART decision tree (rpart, 5 cp values), optimising Accuracy.
# NOTE(review): no set.seed() before train(), so CV folds are not
# reproducible between runs.
model_tree_acc <- train(
  x         = X_train_scaled,
  y         = y_train,
  method    = "rpart",
  metric    = "Accuracy",
  trControl = ctrl_acc,
  tuneLength= 5
)

# 2) Predicted "Yes" probabilities on the test set
probs_tree <- predict(model_tree_acc, X_test_scaled, type = "prob")[, pos_lvl]

# 3) Threshold search maximising Accuracy.
# NOTE(review): tuned on the TEST labels — optimistic bias.
ths_tree  <- seq(0.1, 0.9, by = 0.01)
accs_tree <- sapply(ths_tree, function(t) {
  preds_t <- factor(ifelse(probs_tree > t, pos_lvl, "No"),
                    levels = c("No","Yes"))
  mean(preds_t == y_test)
})
best_t_tree <- ths_tree[which.max(accs_tree)]
cat("Tree best threshold (Accuracy):", best_t_tree, "\n\n")
## Tree best threshold (Accuracy): 0.1
# 4) Final evaluation at the chosen threshold
pred_tree <- factor(ifelse(probs_tree > best_t_tree, pos_lvl, "No"),
                    levels = c("No","Yes"))
cm_tree   <- confusionMatrix(pred_tree, y_test, positive = pos_lvl)

# 5) Metrics (Accuracy first). Sensitivity of 1 together with precision
# equal to the Accuracy suggests the low threshold makes the tree predict
# "Yes" for nearly every store.
cat("Accuracy:   ", round(cm_tree$overall["Accuracy"],  3), "\n")
## Accuracy:    0.733
cat("Precision:  ", round(cm_tree$byClass["Pos Pred Value"], 3), "\n")
## Precision:   0.733
cat("Sensitivity:", round(cm_tree$byClass["Sensitivity"],    3), "\n")
## Sensitivity: 1
cat("F1 Score:   ", round(F1_Score(y_test, pred_tree, positive = pos_lvl), 3), "\n")
## F1 Score:    0.846

Random Forest

# 1) Train a random forest (5 mtry values), optimising Accuracy.
# NOTE(review): no set.seed() before train(), so neither the CV folds nor
# the forest itself are reproducible between runs.
model_rf_acc <- train(
  x         = X_train_scaled,
  y         = y_train,
  method    = "rf",
  metric    = "Accuracy",
  trControl = ctrl_acc,
  tuneLength= 5
)

# 2) Threshold search maximising Accuracy.
# NOTE(review): tuned on the TEST labels — optimistic bias.
probs_rf <- predict(model_rf_acc, X_test_scaled, type="prob")[, pos_lvl]
ths_rf   <- seq(0.1,0.9,by=0.01)
accs_rf  <- sapply(ths_rf, function(t){
  preds <- factor(ifelse(probs_rf>t,pos_lvl,"No"), levels=c("No","Yes"))
  mean(preds==y_test)
})
best_t_rf <- ths_rf[which.max(accs_rf)]
cat("RF best threshold (Accuracy):", best_t_rf, "\n")
## RF best threshold (Accuracy): 0.16
# 3) Metrics at the chosen threshold
pred_rf <- factor(ifelse(probs_rf>best_t_rf,pos_lvl,"No"), levels=c("No","Yes"))
cm_rf   <- confusionMatrix(pred_rf, y_test, positive=pos_lvl)
cat("Accuracy:", round(cm_rf$overall["Accuracy"],3), "\n")
## Accuracy: 0.743
cat("Precision:", round(cm_rf$byClass["Pos Pred Value"],3), "\n")
## Precision: 0.74
cat("Sensitivity:", round(cm_rf$byClass["Sensitivity"],3), "\n")
## Sensitivity: 1
cat("F1:", round(F1_Score(y_test,pred_rf,positive=pos_lvl),3), "\n")
## F1: 0.851

XGBoost

# 1) Parallel backend: use all but one core for caret's resampling
library(doParallel)
cores <- detectCores() - 1
cl <- makeCluster(cores)
registerDoParallel(cl)

# 2) Training control: 5-fold CV with random hyperparameter search
library(caret)
ctrl_rand5 <- trainControl(
  method          = "cv",
  number          = 5,
  search          = "random",
  classProbs      = TRUE,
  summaryFunction = defaultSummary,
  allowParallel   = TRUE
)

# 3) Train XGBoost, sampling 15 random hyperparameter combinations
set.seed(2025)
xgb_rand5 <- train(
  x         = X_train_scaled,
  y         = y_train,
  method    = "xgbTree",
  metric    = "Accuracy",
  trControl = ctrl_rand5,
  tuneLength = 15
)
print(xgb_rand5)  # optimal parameters
## eXtreme Gradient Boosting 
## 
## 1428 samples
##   15 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1142, 1143, 1143, 1142, 1142 
## Resampling results across tuning parameters:
## 
##   eta         max_depth  gamma      colsample_bytree  min_child_weight
##   0.05884483   6         7.6780792  0.6594054          5              
##   0.07951405   5         9.6561064  0.5422108         14              
##   0.07982717   3         2.6295775  0.3342711         11              
##   0.09234926   1         4.9586521  0.6485793          9              
##   0.10843767   3         3.3275529  0.6712665          2              
##   0.21180211   5         9.9572537  0.4436867          7              
##   0.21571093   3         1.0278958  0.4395477          7              
##   0.27287772   1         2.1741177  0.6337370         19              
##   0.31261424   2         1.8580264  0.4082290          7              
##   0.32493396   9         1.0196797  0.5358437         11              
##   0.35449315   4         6.1169863  0.3772096          1              
##   0.36086623   2         0.6920193  0.5073561         12              
##   0.44186551   8         7.2200757  0.3873159         20              
##   0.44968516   7         7.4204021  0.6276336          7              
##   0.45926188  10         4.3026438  0.3252454          1              
##   subsample  nrounds  Accuracy   Kappa    
##   0.8555164  510      0.8053392  0.6106494
##   0.8927929  922      0.7885290  0.5770477
##   0.4332607  461      0.7955441  0.5910779
##   0.6650177  373      0.7794136  0.5588062
##   0.3401240  266      0.7822329  0.5644197
##   0.3746437  900      0.7738265  0.5476364
##   0.6116031  972      0.7885241  0.5770161
##   0.3522443  961      0.7808073  0.5615920
##   0.7868055  881      0.7941357  0.5882429
##   0.8013416  279      0.7864188  0.5728110
##   0.7929506  539      0.7920255  0.5840250
##   0.8039660  891      0.7941504  0.5882774
##   0.4021175  187      0.7458226  0.4916174
##   0.9474417  932      0.7955343  0.5910411
##   0.7491381  983      0.7927273  0.5854072
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 510, max_depth = 6, eta
##  = 0.05884483, gamma = 7.678079, colsample_bytree = 0.6594054,
##  min_child_weight = 5 and subsample = 0.8555164.
# 4) Class-"Yes" probabilities on train and test
train_probs5 <- predict(xgb_rand5, X_train_scaled, type = "prob")[, "Yes"]
test_probs5  <- predict(xgb_rand5, X_test_scaled,  type = "prob")[, "Yes"]

# 5) Threshold search maximising Accuracy on TRAIN. Unlike the other model
# chunks, this correctly avoids tuning the threshold on the test labels.
ths <- seq(0.1, 0.9, by = 0.01)
train_accs5 <- sapply(ths, function(t) {
  preds_t <- factor(ifelse(train_probs5 > t, "Yes", "No"),
                    levels = c("No","Yes"))
  mean(preds_t == y_train)
})
best_t5 <- ths[which.max(train_accs5)]
cat("Umbral óptimo en train (5-fold):", best_t5, "\n")
## Umbral óptimo en train (5-fold): 0.5
# 6) Evaluate on TEST using the train-selected threshold
preds5 <- factor(ifelse(test_probs5 > best_t5, "Yes", "No"),
                 levels = c("No","Yes"))
cm5    <- confusionMatrix(preds5, y_test, positive = "Yes")
cat("Accuracy en test (5-fold + umbral entrenado):", 
    round(cm5$overall["Accuracy"], 3), "\n")
## Accuracy en test (5-fold + umbral entrenado): 0.743
# 7) Stop the parallel cluster and restore sequential execution
stopCluster(cl)
registerDoSEQ()

LightGBM

# 1) Train LightGBM. Label encoding: y_train has levels c("No","Yes"), so
# as.numeric(y_train) - 1 maps No -> 0 and Yes -> 1.
# NOTE(review): nrounds = 100 is fixed — no CV tuning like the other models.
dtrain <- lgb.Dataset(as.matrix(X_train_scaled), label = as.numeric(y_train)-1)
params <- list(objective="binary", metric="binary_logloss")
model_lgb_acc <- lgb.train(params, dtrain, nrounds=100, verbose=-1)

# 2) Threshold search maximising Accuracy.
# NOTE(review): tuned on the TEST labels — optimistic bias.
probs_lgb <- predict(model_lgb_acc, as.matrix(X_test_scaled))
ths_lgb   <- seq(0.1,0.9,by=0.01)
accs_lgb  <- sapply(ths_lgb, function(t){
  preds <- factor(ifelse(probs_lgb>t,pos_lvl,"No"), levels=c("No","Yes"))
  mean(preds==y_test)
})
best_t_lgb <- ths_lgb[which.max(accs_lgb)]
cat("LGBM best threshold (Accuracy):", best_t_lgb, "\n")
## LGBM best threshold (Accuracy): 0.41
# 3) Metrics at the chosen threshold
pred_lgb <- factor(ifelse(probs_lgb>best_t_lgb,pos_lvl,"No"), levels=c("No","Yes"))
cm_lgb   <- confusionMatrix(pred_lgb, y_test, positive=pos_lvl)
cat("Accuracy:", round(cm_lgb$overall["Accuracy"],3), "\n")
## Accuracy: 0.771
cat("Precision:", round(cm_lgb$byClass["Pos Pred Value"],3), "\n")
## Precision: 0.791
cat("Sensitivity:", round(cm_lgb$byClass["Sensitivity"],3), "\n")
## Sensitivity: 0.935
cat("F1:", round(F1_Score(y_test,pred_lgb,positive=pos_lvl),3), "\n")
## F1: 0.857

Neural Network (MLP)

# 1) Train a single-hidden-layer MLP (nnet), optimising Accuracy.
# NOTE(review): no set.seed() before train(), so CV folds and the random
# weight initialisation are not reproducible between runs.
model_nnet_acc <- train(
  x         = X_train_scaled,
  y         = y_train,
  method    = "nnet",
  metric    = "Accuracy",
  trControl = ctrl_acc,
  tuneLength= 3,
  trace     = FALSE,
  maxit     = 1000
)

# 2) Threshold search maximising Accuracy.
# NOTE(review): tuned on the TEST labels — optimistic bias.
probs_nnet <- predict(model_nnet_acc, X_test_scaled, type="prob")[, pos_lvl]
ths_nnet   <- seq(0.1,0.9,by=0.01)
accs_nnet  <- sapply(ths_nnet, function(t){
  preds <- factor(ifelse(probs_nnet>t,pos_lvl,"No"), levels=c("No","Yes"))
  mean(preds==y_test)
})
best_t_nnet <- ths_nnet[which.max(accs_nnet)]
cat("MLP best threshold (Accuracy):", best_t_nnet, "\n")
## MLP best threshold (Accuracy): 0.17
# 3) Metrics at the chosen threshold
pred_nnet <- factor(ifelse(probs_nnet>best_t_nnet,pos_lvl,"No"), levels=c("No","Yes"))
cm_nnet   <- confusionMatrix(pred_nnet, y_test, positive=pos_lvl)
cat("Accuracy:", round(cm_nnet$overall["Accuracy"],3), "\n")
## Accuracy: 0.79
cat("Precision:", round(cm_nnet$byClass["Pos Pred Value"],3), "\n")
## Precision: 0.778
cat("Sensitivity:", round(cm_nnet$byClass["Sensitivity"],3), "\n")
## Sensitivity: 1
cat("F1:", round(F1_Score(y_test,pred_nnet,positive=pos_lvl),3), "\n")
## F1: 0.875

```